<!DOCTYPE html>



  


<html class="theme-next gemini use-motion" lang="zh-Hans">
<head><meta name="generator" content="Hexo 3.8.0">
  <meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#222">









<meta http-equiv="Cache-Control" content="no-transform">
<meta http-equiv="Cache-Control" content="no-siteapp">
















  
  
  <link href="/hclearn/lib/fancybox/source/jquery.fancybox.css?v=2.1.5" rel="stylesheet" type="text/css">




  
  
  
  

  
    
    
  

  
    
      
    

    
  

  

  

  

  
    
    
    <link href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic|Lato:300,300italic,400,400italic,700,700italic&subset=latin,latin-ext" rel="stylesheet" type="text/css">
  






<link href="/hclearn/lib/font-awesome/css/font-awesome.min.css?v=4.6.2" rel="stylesheet" type="text/css">

<link href="/hclearn/css/main.css?v=5.1.4" rel="stylesheet" type="text/css">


  <link rel="apple-touch-icon" sizes="180x180" href="/hclearn/images/apple-touch-icon-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="32x32" href="/hclearn/images/favicon-32x32-next.png?v=5.1.4">


  <link rel="icon" type="image/png" sizes="16x16" href="/hclearn/images/favicon-16x16-next.png?v=5.1.4">


  <link rel="mask-icon" href="/hclearn/images/logo.svg?v=5.1.4" color="#222">





  <meta name="keywords" content="agent,state,env,">





  <link rel="alternate" href="/hclearn/atom.xml" title="HClearn" type="application/atom+xml">






<meta name="description" content="这节课主要介绍强化学习的基本概念，包括 reward, agent, environment, state 等。">
<meta name="keywords" content="agent,state,env">
<meta property="og:type" content="article">
<meta property="og:title" content="Lecture 1. Introduction to Reinforcement Learning">
<meta property="og:url" content="http://guyuecanhui.gitee.io/hclearn/2020/07/19/ucl-rl-01/index.html">
<meta property="og:site_name" content="HClearn">
<meta property="og:description" content="这节课主要介绍强化学习的基本概念，包括 reward, agent, environment, state 等。">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="http://guyuecanhui.gitee.io/hclearn/2020/07/19/ucl-rl-01/agent_and_env.png">
<meta property="og:image" content="http://guyuecanhui.gitee.io/hclearn/2020/07/19/ucl-rl-01/category_rl_agent.png">
<meta property="og:updated_time" content="2020-07-19T01:09:20.462Z">
<meta name="twitter:card" content="summary">
<meta name="twitter:title" content="Lecture 1. Introduction to Reinforcement Learning">
<meta name="twitter:description" content="这节课主要介绍强化学习的基本概念，包括 reward, agent, environment, state 等。">
<meta name="twitter:image" content="http://guyuecanhui.gitee.io/hclearn/2020/07/19/ucl-rl-01/agent_and_env.png">



<script type="text/javascript" id="hexo.configurations">
  // Declares two globals for this page:
  //   NexT   - reuses window.NexT when it already exists, otherwise an empty object.
  //   CONFIG - static settings object (site root, scheme, sidebar, motion, search labels).
  // NOTE(review): presumably consumed by the theme scripts loaded at the end of <body> - confirm.
  var NexT = window.NexT || {};

  var CONFIG = {
    root: '/hclearn/',
    scheme: 'Gemini',
    version: '5.1.4',
    sidebar: {
      "position": "left",
      "display": "post",
      "offset": 12,
      "b2t": false,
      "scrollpercent": false,
      "onmobile": false
    },
    fancybox: true,
    tabs: true,
    motion: {
      "enable": true,
      "async": false,
      "transition": {
        "post_block": "fadeIn",
        "post_header": "slideDownIn",
        "post_body": "slideDownIn",
        "coll_header": "slideLeftIn",
        "sidebar": "slideUpIn"
      }
    },
    duoshuo: { userId: '0', author: '博主' },
    algolia: {
      applicationID: '',
      apiKey: '',
      indexName: '',
      hits: { "per_page": 10 },
      // The ${...} placeholders are plain text inside ordinary string literals here;
      // nothing in this block interpolates them.
      labels: {
        "input_placeholder": "Search for Posts",
        "hits_empty": "We didn't find any results for the search: ${query}",
        "hits_stats": "${hits} results found in ${time} ms"
      }
    }
  };
</script>



  <link rel="canonical" href="http://guyuecanhui.gitee.io/hclearn/2020/07/19/ucl-rl-01/">





  <title>Lecture 1. Introduction to Reinforcement Learning | HClearn</title>
  








</head>

<body itemscope itemtype="http://schema.org/WebPage" lang="zh-Hans">

  
  
    
  

  <div class="container sidebar-position-left page-post-detail">
    <div class="headband"></div>

    <header id="header" class="header" itemscope itemtype="http://schema.org/WPHeader">
      <div class="header-inner"><div class="site-brand-wrapper">
  <div class="site-meta ">
    

    <div class="custom-logo-site-title">
      <a href="/hclearn/" class="brand" rel="start">
        <span class="logo-line-before"><i></i></span>
        <span class="site-title">HClearn</span>
        <span class="logo-line-after"><i></i></span>
      </a>
    </div>
      
        <p class="site-subtitle">Keep hungry, keep foolish</p>
      
  </div>

  <div class="site-nav-toggle">
    <button type="button" aria-label="切换导航栏">
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
      <span class="btn-bar"></span>
    </button>
  </div>
</div>

<nav class="site-nav">
  

  
    <ul id="menu" class="menu">
      
        
        <li class="menu-item menu-item-home">
          <a href="/hclearn/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-home"></i> <br>
            
            首页
          </a>
        </li>
      
        
        <li class="menu-item menu-item-about">
          <a href="/hclearn/about/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-user"></i> <br>
            
            关于
          </a>
        </li>
      
        
        <li class="menu-item menu-item-tags">
          <a href="/hclearn/tags/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-tags"></i> <br>
            
            标签
          </a>
        </li>
      
        
        <li class="menu-item menu-item-categories">
          <a href="/hclearn/categories/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-th"></i> <br>
            
            分类
          </a>
        </li>
      
        
        <li class="menu-item menu-item-archives">
          <a href="/hclearn/archives/" rel="section">
            
              <i class="menu-item-icon fa fa-fw fa-archive"></i> <br>
            
            归档
          </a>
        </li>
      

      
        <li class="menu-item menu-item-search">
          
            <a href="javascript:;" class="popup-trigger">
          
            
              <i class="menu-item-icon fa fa-search fa-fw"></i> <br>
            
            搜索
          </a>
        </li>
      
    </ul>
  

  
    <div class="site-search">
      
  <div class="popup search-popup local-search-popup">
  <div class="local-search-header clearfix">
    <span class="search-icon">
      <i class="fa fa-search"></i>
    </span>
    <span class="popup-btn-close">
      <i class="fa fa-times-circle"></i>
    </span>
    <div class="local-search-input-wrapper">
      <input autocomplete="off" placeholder="搜索..." spellcheck="false" type="text" id="local-search-input" aria-label="搜索">
    </div>
  </div>
  <div id="local-search-result"></div>
</div>



    </div>
  
</nav>



 </div>
    </header>

    <main id="main" class="main">
      <div class="main-inner">
        <div class="content-wrap">
          <div id="content" class="content">
            

  <div id="posts" class="posts-expand">
    

  

  
  
  

  <article class="post post-type-normal" itemscope itemtype="http://schema.org/Article">
  
  
  
  <div class="post-block">
    <link itemprop="mainEntityOfPage" href="http://guyuecanhui.gitee.io/hclearn/2020/07/19/ucl-rl-01/">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="name" content="古月残辉">
      <meta itemprop="description" content>
      <meta itemprop="image" content="/hclearn/images/avatar.gif">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="HClearn">
    </span>

    
      <header class="post-header">

        
        
          <h1 class="post-title" itemprop="name headline">Lecture 1. Introduction to Reinforcement Learning</h1>
        

        <div class="post-meta">
          <span class="post-time">
            
              <span class="post-meta-item-icon">
                <i class="fa fa-calendar-o"></i>
              </span>
              
                <span class="post-meta-item-text">发表于</span>
              
              <time title="创建于" itemprop="dateCreated datePublished" datetime="2020-07-19T00:04:31+08:00">
                2020-07-19
              </time>
            

            

            
          </span>

          
            <span class="post-category">
            
              <span class="post-meta-divider">|</span>
            
              <span class="post-meta-item-icon">
                <i class="fa fa-folder-o"></i>
              </span>
              
                <span class="post-meta-item-text">分类于</span>
              
              
                <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
                  <a href="/hclearn/categories/强化学习/" itemprop="url" rel="index">
                    <span itemprop="name">强化学习</span>
                  </a>
                </span>

                
                
              
            </span>
          

          
            
              <span class="post-comments-count">
                <span class="post-meta-divider">|</span>
                <span class="post-meta-item-icon">
                  <i class="fa fa-comment-o"></i>
                </span>
                <a href="/hclearn/2020/07/19/ucl-rl-01/#comments" itemprop="discussionUrl">
                  <span class="post-comments-count valine-comment-count" data-xid="/hclearn/2020/07/19/ucl-rl-01/" itemprop="commentCount"></span>
                </a>
              </span>
            
          

          
          

          
            <span class="post-meta-divider">|</span>
            <span class="page-pv"><i class="fa fa-file-o"></i> 阅读数
            <span class="busuanzi-value" id="busuanzi_value_page_pv"></span>
            </span>
          

          
            <div class="post-wordcount">
              
                
                <span class="post-meta-item-icon">
                  <i class="fa fa-file-word-o"></i>
                </span>
                
                  <span class="post-meta-item-text">字数统计&#58;</span>
                
                <span title="字数统计">
                  1k
                </span>
              

              
                <span class="post-meta-divider">|</span>
              

              
                <span class="post-meta-item-icon">
                  <i class="fa fa-clock-o"></i>
                </span>
                
                  <span class="post-meta-item-text">阅读时长 &asymp;</span>
                
                <span title="阅读时长">
                  6
                </span>
              
            </div>
          

          

        </div>
      </header>
    

    
    
    
    <div class="post-body" itemprop="articleBody">

      
      

      
        <p>这节课主要介绍强化学习的基本概念，包括 reward, agent, environment, state 等。</p>
<a id="more"></a>
<p>What makes reinforcement learning different from other machine learning paradigms?</p>
<ul>
<li>There is no supervisor, only a reward signal </li>
<li>Feedback is delayed, not instantaneous </li>
<li>Time really matters (sequential, non-i.i.d. data) </li>
<li>Agent’s actions affect the subsequent data it receives</li>
</ul>
<h2 id="Reward"><a href="#Reward" class="headerlink" title="Reward"></a>Reward</h2><p>Reinforcement learning is based on the reward hypothesis:</p>
<div class="note success"><p>All goals can be described by the maximisation of expected cumulative reward </p></div>
<ul>
<li>A reward $R_t$ is a scalar feedback signal </li>
<li>Indicates how well agent is doing at step $t$ </li>
<li>The agent’s job is to maximise cumulative reward</li>
</ul>
<h3 id="Sequential-Decision-Making"><a href="#Sequential-Decision-Making" class="headerlink" title="Sequential Decision Making"></a>Sequential Decision Making</h3><ul>
<li>Goal: select actions to maximise total future reward </li>
<li>Actions may have long term consequences </li>
<li>Reward may be delayed </li>
<li>It may be better to sacrifice immediate reward to gain more long-term reward </li>
<li>Examples: <ul>
<li>A financial investment (may take months to mature) </li>
<li>Refuelling a helicopter (might prevent a crash in several hours) </li>
<li>Blocking opponent moves (might help winning chances many moves from now)</li>
</ul>
</li>
</ul>
<h2 id="Agent-and-Environment"><a href="#Agent-and-Environment" class="headerlink" title="Agent and Environment"></a>Agent and Environment</h2><div style="width:70%; margin:auto">
<img src="/hclearn/2020/07/19/ucl-rl-01/agent_and_env.png" alt="agent 和 环境的交互关系" title="agent 和 环境的交互关系">
</div>

<ul>
<li>At each step $t$ the agent: <ul>
<li>Executes action $A_t$ </li>
<li>Receives observation $O_t$ </li>
<li>Receives scalar reward $R_t$ </li>
</ul>
</li>
<li>The environment: <ul>
<li>Receives action $A_t$ </li>
<li>Emits observation $O_{t+1}$ </li>
<li>Emits scalar reward $R_{t+1}$ </li>
</ul>
</li>
<li>$t$ increments at env. step</li>
</ul>
<h2 id="State"><a href="#State" class="headerlink" title="State"></a>State</h2><h3 id="History-and-State"><a href="#History-and-State" class="headerlink" title="History and State"></a>History and State</h3><ul>
<li>The history is the sequence of observations, actions, rewards: <script type="math/tex">H_t = O_1, R_1, A_1, \cdots, A_{t-1}, O_t , R_t</script></li>
<li>i.e. all observable variables up to time $t$ </li>
<li>i.e. the sensorimotor stream of a robot or embodied agent </li>
<li>What happens next depends on the history: <ul>
<li>The agent selects actions </li>
<li>The environment selects observations/rewards </li>
</ul>
</li>
<li>State is the information used to determine what happens next </li>
<li>Formally, state is a function of the history: $S_t = f (H_t)$</li>
</ul>
<h3 id="Environment-State"><a href="#Environment-State" class="headerlink" title="Environment State"></a>Environment State</h3><ul>
<li>The environment state $S^e_t$ is the environment’s private representation </li>
<li>i.e. whatever data the environment uses to pick the next observation/reward </li>
<li>The environment state is not usually visible to the agent </li>
<li>Even if $S^e_t$ is visible, it may contain irrelevant information</li>
</ul>
<h3 id="Agent-State"><a href="#Agent-State" class="headerlink" title="Agent State"></a>Agent State</h3><ul>
<li>The agent state $S^a_t$ is the agent’s internal representation </li>
<li>i.e. whatever information the agent uses to pick the next action </li>
<li>i.e. it is the information used by reinforcement learning algorithms </li>
<li>It can be any function of history: $S^a_t = f (H_t)$</li>
</ul>
<h3 id="Information-State"><a href="#Information-State" class="headerlink" title="Information State"></a>Information State</h3><p>An information state (a.k.a. Markov state) contains all useful information from the history. </p>
<div class="note success"><p>A state <script type="math/tex">S_t</script> is Markov if and only if <script type="math/tex">\mathbb{P}[S_{t+1} | S_t ] = \mathbb{P}[S_{t+1} | S_1,\cdots, S_t ]</script> </p></div>
<ul>
<li>“The future is independent of the past given the present” <script type="math/tex">H_{1:t} \rightarrow S_t \rightarrow H_{t+1:\infty}</script> </li>
<li>Once the state is known, the history may be thrown away </li>
<li>i.e. The state is a sufficient statistic of the future </li>
<li>The environment state $S^e_t$ is Markov </li>
<li>The history $H_t$ is Markov</li>
</ul>
<h3 id="Fully-Observable-Environments"><a href="#Fully-Observable-Environments" class="headerlink" title="Fully Observable Environments"></a>Fully Observable Environments</h3><p>Full observability: agent directly observes environment state $O_t = S^a_t = S^e_t$ </p>
<ul>
<li>Agent state = environment state = information state </li>
<li>Formally, this is a Markov decision process (MDP) </li>
</ul>
<h3 id="Partially-Observable-Environments"><a href="#Partially-Observable-Environments" class="headerlink" title="Partially Observable Environments"></a>Partially Observable Environments</h3><ul>
<li>Partial observability: agent indirectly observes environment: <ul>
<li>A robot with camera vision isn’t told its absolute location </li>
<li>A trading agent only observes current prices </li>
<li>A poker playing agent only observes public cards</li>
</ul>
</li>
<li>Now agent state $\not =$ environment state </li>
<li>Formally this is a partially observable Markov decision process (POMDP) </li>
<li>Agent must construct its own state representation $S^a_t$, e.g. <ul>
<li>Complete history: $S^a_t = H_t$ </li>
<li>Beliefs of environment state: $S^a_t = (\mathbb{P}[S^e_t = s_1 ], \cdots, \mathbb{P}[S^e_t = s_n ]) $</li>
<li>Recurrent neural network: <script type="math/tex">S^a_t = \sigma (S^a_{t-1} W_s + O_tW_o )</script></li>
</ul>
</li>
</ul>
<h2 id="RL-Agent"><a href="#RL-Agent" class="headerlink" title="RL Agent"></a>RL Agent</h2><p>An RL agent may include one or more of these components: </p>
<ul>
<li>Policy: agent’s behaviour function </li>
<li>Value function: how good is each state and/or action </li>
<li>Model: agent’s representation of the environment</li>
</ul>
<h3 id="Policy"><a href="#Policy" class="headerlink" title="Policy"></a>Policy</h3><ul>
<li>A policy is the agent’s behaviour </li>
<li>It is a map from state to action, e.g. <ul>
<li>Deterministic policy: $a = \pi(s)$ </li>
<li>Stochastic policy: $\pi(a|s) = \mathbb{P}[A_t = a|S_t = s]$</li>
</ul>
</li>
</ul>
<h3 id="Value-Function"><a href="#Value-Function" class="headerlink" title="Value Function"></a>Value Function</h3><ul>
<li>Value function is a prediction of future reward </li>
<li>Used to evaluate the goodness/badness of states </li>
<li>And therefore to select between actions, e.g. <script type="math/tex">v_\pi(s) = \mathbb{E}_\pi [R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + ... | S_t = s]</script></li>
</ul>
<h3 id="Model"><a href="#Model" class="headerlink" title="Model"></a>Model</h3><ul>
<li><p>A model predicts what the environment will do next </p>
</li>
<li><p>$P$ predicts the next state </p>
</li>
<li><p>$R$ predicts the next (immediate) reward, e.g. </p>
</li>
</ul>
<script type="math/tex; mode=display">
P^a_{ss'} = \mathbb{P}[S_{t+1} = s' | S_t = s, A_t = a] \\
R^a_s = \mathbb{E} [R_{t+1} | S_t = s, A_t = a]</script><ul>
<li>Agent may have an internal model of the environment </li>
<li>Dynamics: how actions change the state </li>
<li>Rewards: how much reward from each state </li>
<li>The model may be imperfect </li>
<li>Grid layout represents transition model $P^a_{ss'}$ </li>
<li>Numbers represent immediate reward $R^a_s$ from each state $s$ (same for all $a$)</li>
</ul>
<h3 id="Categorizing-RL-Agents"><a href="#Categorizing-RL-Agents" class="headerlink" title="Categorizing RL Agents"></a>Categorizing RL Agents</h3><div style="width:70%; margin:auto">
<img src="/hclearn/2020/07/19/ucl-rl-01/category_rl_agent.png" alt="RL agent 分类" title="RL agent 分类">
</div>

<ul>
<li>Value Based <ul>
<li><em>No Policy (Implicit)</em> </li>
<li>Value Function </li>
</ul>
</li>
<li>Policy Based <ul>
<li>Policy </li>
<li><em>No Value Function</em> </li>
</ul>
</li>
<li>Actor Critic <ul>
<li>Policy </li>
<li>Value Function</li>
</ul>
</li>
<li>Model Free <ul>
<li>Policy and/or Value Function </li>
<li><em>No Model</em> </li>
</ul>
</li>
<li>Model Based <ul>
<li>Policy and/or Value Function </li>
<li>Model</li>
</ul>
</li>
</ul>
<h2 id="Porblems-within-RL"><a href="#Porblems-within-RL" class="headerlink" title="Problems within RL"></a>Problems within RL</h2><h3 id="Learning-and-Planning"><a href="#Learning-and-Planning" class="headerlink" title="Learning and Planning"></a>Learning and Planning</h3><p>Two fundamental problems in sequential decision making </p>
<ul>
<li>Reinforcement Learning: <ul>
<li>The environment is initially unknown </li>
<li>The agent interacts with the environment </li>
<li>The agent improves its policy </li>
</ul>
</li>
<li>Planning: <ul>
<li>A model of the environment is known </li>
<li>The agent performs computations with its model (without any external interaction) </li>
<li>The agent improves its policy </li>
<li>a.k.a. deliberation, reasoning, introspection, pondering, thought, search</li>
</ul>
</li>
</ul>
<h3 id="Exploration-and-Exploitation"><a href="#Exploration-and-Exploitation" class="headerlink" title="Exploration and Exploitation"></a>Exploration and Exploitation</h3><ul>
<li>Reinforcement learning is like trial-and-error learning </li>
<li>The agent should discover a good policy </li>
<li>From its experiences of the environment </li>
<li>Without losing too much reward along the way</li>
<li>Exploration finds more information about the environment </li>
<li>Exploitation exploits known information to maximise reward </li>
<li>It is usually important to explore as well as exploit</li>
</ul>
<h3 id="Prediction-and-Control"><a href="#Prediction-and-Control" class="headerlink" title="Prediction and Control"></a>Prediction and Control</h3><ul>
<li>Prediction: evaluate the future <ul>
<li>Given a policy </li>
</ul>
</li>
<li>Control: optimise the future <ul>
<li>Find the best policy</li>
</ul>
</li>
</ul>
<div class="note success"><p><a href="/hclearn/2020/07/18/ucl-rl/" target="_blank" rel="noopener">课程首页</a></p>
<ul>
<li>Lecture 1: <a href="/hclearn/2020/07/19/ucl-rl-01/" target="_blank" rel="noopener">Introduction to Reinforcement Learning</a></li>
<li>Lecture 2: Markov Decision Processes</li>
<li>Lecture 3: Planning by Dynamic Programming</li>
<li>Lecture 4: Model-Free Prediction</li>
<li>Lecture 5: Model-Free Control</li>
<li>Lecture 6: Value Function Approximation</li>
<li>Lecture 7: Policy Gradient Methods</li>
<li>Lecture 8: Integrating Learning and Planning</li>
<li>Lecture 9: Exploration and Exploitation</li>
<li>Lecture 10: Case Study: RL in Classic Games</li>
</ul></div>
      
    </div>
    
    
    

    

    

    

    <footer class="post-footer">
      
        <div class="post-tags">
          
            <a href="/hclearn/tags/agent/" rel="tag"># agent</a>
          
            <a href="/hclearn/tags/state/" rel="tag"># state</a>
          
            <a href="/hclearn/tags/env/" rel="tag"># env</a>
          
        </div>
      

      
      
      

      
        <div class="post-nav">
          <div class="post-nav-next post-nav-item">
            
              <a href="/hclearn/2020/07/18/ucl-rl/" rel="next" title="UCL Course on RL">
                <i class="fa fa-chevron-left"></i> UCL Course on RL
              </a>
            
          </div>

          <span class="post-nav-divider"></span>

          <div class="post-nav-prev post-nav-item">
            
          </div>
        </div>
      

      
      
    </footer>
  </div>
  
  
  
  </article>



    <div class="post-spread">
      
    </div>
  </div>


          </div>
          


          

  
    <div class="comments" id="comments">
    </div>
  



        </div>
        
          
  
  <div class="sidebar-toggle">
    <div class="sidebar-toggle-line-wrap">
      <span class="sidebar-toggle-line sidebar-toggle-line-first"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-middle"></span>
      <span class="sidebar-toggle-line sidebar-toggle-line-last"></span>
    </div>
  </div>

  <aside id="sidebar" class="sidebar">
    
    <div class="sidebar-inner">

      

      
        <ul class="sidebar-nav motion-element">
          <li class="sidebar-nav-toc sidebar-nav-active" data-target="post-toc-wrap">
            文章目录
          </li>
          <li class="sidebar-nav-overview" data-target="site-overview-wrap">
            站点概览
          </li>
        </ul>
      

      <section class="site-overview-wrap sidebar-panel">
        <div class="site-overview">
          <div class="site-author motion-element" itemprop="author" itemscope itemtype="http://schema.org/Person">
            
              <p class="site-author-name" itemprop="name">古月残辉</p>
              <p class="site-description motion-element" itemprop="description">Standing on the shoulders of giants</p>
          </div>

          <nav class="site-state motion-element">

            
              <div class="site-state-item site-state-posts">
              
                <a href="/hclearn/archives/">
              
                  <span class="site-state-item-count">2</span>
                  <span class="site-state-item-name">日志</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-categories">
                <a href="/hclearn/categories/index.html">
                  <span class="site-state-item-count">1</span>
                  <span class="site-state-item-name">分类</span>
                </a>
              </div>
            

            
              
              
              <div class="site-state-item site-state-tags">
                <a href="/hclearn/tags/index.html">
                  <span class="site-state-item-count">5</span>
                  <span class="site-state-item-name">标签</span>
                </a>
              </div>
            

          </nav>

          
            <div class="feed-link motion-element">
              <a href="/hclearn/atom.xml" rel="alternate">
                <i class="fa fa-rss"></i>
                RSS
              </a>
            </div>
          

          
            <div class="links-of-author motion-element">
                
                  <span class="links-of-author-item">
                    <a href="mailto:guyuecanhui@icloud.com" target="_blank" title="E-Mail">
                      
                        <i class="fa fa-fw fa-envelope"></i>E-Mail</a>
                  </span>
                
            </div>
          

          
          

          
          

          

        </div>
      </section>

      
      <!--noindex-->
        <section class="post-toc-wrap motion-element sidebar-panel sidebar-panel-active">
          <div class="post-toc">

            
              
            

            
              <div class="post-toc-content"><ol class="nav"><li class="nav-item nav-level-2"><a class="nav-link" href="#Reward"><span class="nav-number">1.</span> <span class="nav-text">Reward</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#Sequential-Decision-Making"><span class="nav-number">1.1.</span> <span class="nav-text">Sequential Decision Making</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Agent-and-Environment"><span class="nav-number">2.</span> <span class="nav-text">Agent and Environment</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#State"><span class="nav-number">3.</span> <span class="nav-text">State</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#History-and-State"><span class="nav-number">3.1.</span> <span class="nav-text">History and State</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Environment-State"><span class="nav-number">3.2.</span> <span class="nav-text">Environment State</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Agent-State"><span class="nav-number">3.3.</span> <span class="nav-text">Agent State</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Information-State"><span class="nav-number">3.4.</span> <span class="nav-text">Information State</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Fully-Observable-Environments"><span class="nav-number">3.5.</span> <span class="nav-text">Fully Observable Environments</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Partially-Observable-Environments"><span class="nav-number">3.6.</span> <span class="nav-text">Partially Observable Environments</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#RL-Agent"><span class="nav-number">4.</span> <span class="nav-text">RL 
Agent</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#Policy"><span class="nav-number">4.1.</span> <span class="nav-text">Policy</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Value-Function"><span class="nav-number">4.2.</span> <span class="nav-text">Value Function</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Model"><span class="nav-number">4.3.</span> <span class="nav-text">Model</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Categorizing-RL-Agents"><span class="nav-number">4.4.</span> <span class="nav-text">Categorizing RL Agents</span></a></li></ol></li><li class="nav-item nav-level-2"><a class="nav-link" href="#Porblems-within-RL"><span class="nav-number">5.</span> <span class="nav-text">Problems within RL</span></a><ol class="nav-child"><li class="nav-item nav-level-3"><a class="nav-link" href="#Learning-and-Planning"><span class="nav-number">5.1.</span> <span class="nav-text">Learning and Planning</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Exploration-and-Exploitation"><span class="nav-number">5.2.</span> <span class="nav-text">Exploration and Exploitation</span></a></li><li class="nav-item nav-level-3"><a class="nav-link" href="#Prediction-and-Control"><span class="nav-number">5.3.</span> <span class="nav-text">Prediction and Control</span></a></li></ol></li></ol></div>
            

          </div>
        </section>
      <!--/noindex-->
      

      

    </div>
  </aside>


        
      </div>
    </main>

    <footer id="footer" class="footer">
      <div class="footer-inner">
        <div class="copyright">&copy; 2018 &mdash; <span itemprop="copyrightYear">2020</span>
  <span class="with-love">
    <i class="fa fa-user"></i>
  </span>
  <span class="author" itemprop="copyrightHolder">古月残辉</span>

  
    <span class="post-meta-divider">|</span>
    <span class="post-meta-item-icon">
      <i class="fa fa-area-chart"></i>
    </span>
    
      <span class="post-meta-item-text">Site words total count&#58;</span>
    
    <span title="Site words total count">1.1k</span>
  
</div>









<script async src="//busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>

        
<div class="busuanzi-count">
  <!-- Site-wide visitor / page-view counters. The Busuanzi counter script is
       already loaded once, earlier in this footer, from busuanzi.ibruce.info.
       The second copy that used to sit here pointed at the retired
       dn-lbstatics.qbox.me host, so it was either a dead request or a
       duplicate count; it has been removed. The spans below are left for the
       counter script to fill in (they are empty in the static markup). -->
  <span class="site-uv">
    <i class="fa fa-user"></i> 访问人数
    <span class="busuanzi-value" id="busuanzi_value_site_uv"></span>
    人
  </span>

  <span class="site-pv">
    <i class="fa fa-eye"></i> 总访问量
    <span class="busuanzi-value" id="busuanzi_value_site_pv"></span>
    次
  </span>
</div>








        
      </div>
    </footer>

    
      <div class="back-to-top">
        <i class="fa fa-arrow-up"></i>
        
      </div>
    

    

  </div>

  

<script type="text/javascript">
  // If window.Promise is anything other than a plain function (missing, or
  // replaced by some non-function value), reset it to null.
  // NOTE(review): presumably so later scripts can feature-test Promise reliably - confirm.
  (function () {
    var promiseTag = Object.prototype.toString.call(window.Promise);
    var isFunction = promiseTag === '[object Function]';
    if (!isFunction) {
      window.Promise = null;
    }
  })();
</script>









  












  
  
    <script type="text/javascript" src="/hclearn/lib/jquery/index.js?v=2.1.3"></script>
  

  
  
    <script type="text/javascript" src="/hclearn/lib/fastclick/lib/fastclick.min.js?v=1.0.6"></script>
  

  
  
    <script type="text/javascript" src="/hclearn/lib/jquery_lazyload/jquery.lazyload.js?v=1.9.7"></script>
  

  
  
    <script type="text/javascript" src="/hclearn/lib/velocity/velocity.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/hclearn/lib/velocity/velocity.ui.min.js?v=1.2.1"></script>
  

  
  
    <script type="text/javascript" src="/hclearn/lib/fancybox/source/jquery.fancybox.pack.js?v=2.1.5"></script>
  


  


  <script type="text/javascript" src="/hclearn/js/src/utils.js?v=5.1.4"></script>

  <script type="text/javascript" src="/hclearn/js/src/motion.js?v=5.1.4"></script>



  
  


  <script type="text/javascript" src="/hclearn/js/src/affix.js?v=5.1.4"></script>

  <script type="text/javascript" src="/hclearn/js/src/schemes/pisces.js?v=5.1.4"></script>



  
  <script type="text/javascript" src="/hclearn/js/src/scrollspy.js?v=5.1.4"></script>
<script type="text/javascript" src="/hclearn/js/src/post-details.js?v=5.1.4"></script>



  


  <script type="text/javascript" src="/hclearn/js/src/bootstrap.js?v=5.1.4"></script>



  


  




	





  





  










  <script src="//cdn1.lncld.net/static/js/3.0.4/av-min.js"></script>
  <script src="//unpkg.com/valine/dist/Valine.min.js"></script>
  
  <script type="text/javascript">
    // Whitelist of guest-info field names; the configured list is filtered against it.
    var GUEST = ['nick','mail','link'];
    var guest = 'nick,mail,link'
      .split(',')
      .filter(function (field) {
        return GUEST.indexOf(field) !== -1;
      });

    // Instantiate the Valine comment widget on the #comments element.
    new Valine({
      el: '#comments' ,
      verify: false,
      notify: false,
      appId: '6du4Ppc2TvUuhcccRHSDNH2v-gzGzoHsz',
      appKey: 'zOKNml4W1Bq3OTzEuLt5hUjI',
      placeholder: '感谢阅读！欢迎评论！',
      avatar: 'mm',
      guest_info: guest,
      // Template-generated expression; the non-empty string always wins, so this is '10'.
      pageSize: '10' || 10,
    });
  </script>



  

  <script type="text/javascript">
    // Local-search state shared by the functions below.
    // `isfetched` flips to true once the search database has been loaded.
    var isfetched = false;

    // Location of the search database, relative to the site root.
    var search_path = "search.xml";
    if (search_path.length === 0) {
      search_path = "search.xml";
    }

    // The database is treated as XML unless its path ends in "json" (case-insensitive).
    var isXml = !/json$/i.test(search_path);

    var path = "/hclearn/" + search_path;

    /**
     * Close the search popup: hide it, clear the query box, drop any rendered
     * results / placeholder / overlay, and reset the body's inline overflow style.
     */
    var onPopupClose = function (e) {
      $('.popup').hide();
      $('#local-search-input').val('');

      // Elements created while searching that must not survive a close.
      var disposable = ['.search-result-list', '#no-result', ".local-search-pop-overlay"];
      disposable.forEach(function (selector) {
        $(selector).remove();
      });

      $('body').css('overflow', '');
    };

    /**
     * Show the search popup: add a click-to-close overlay, set the body's
     * overflow to hidden, toggle the popup and focus the search input.
     */
    function proceedsearch() {
      var overlayMarkup = '<div class="search-popup-overlay local-search-pop-overlay"></div>';
      var $body = $("body");
      $body.append(overlayMarkup);
      $body.css('overflow', 'hidden');

      $('.search-popup-overlay').click(onPopupClose);
      $('.popup').toggle();

      // Set the autocapitalize / autocorrect attributes on the query box, then focus it.
      $('#local-search-input')
        .attr("autocapitalize", "none")
        .attr("autocorrect", "off")
        .focus();
    }

    /**
     * Fetch the local search database, wire the search box to it and open the popup.
     *
     * While the request is in flight a loading overlay is appended to <body> and
     * the body's overflow is set to hidden. On success the overlay is removed, a
     * listener is attached to the search input and proceedsearch() shows the popup.
     * On failure the overlay is removed and the overflow style reset; `isfetched`
     * stays false, so the next click on the trigger retries the request.
     *
     * @param {string} path       URL of the search database (parsed as XML or JSON depending on `isXml`).
     * @param {string} search_id  id of the search <input> element.
     * @param {string} content_id id of the element whose innerHTML receives the rendered results.
     */
    var searchFunc = function(path, search_id, content_id) {
      'use strict';

      // Undo the loading state: drop the overlay and reset the body's overflow style.
      function clearLoadingState() {
        $(".local-search-pop-overlay").remove();
        $('body').css('overflow', '');
      }

      // start loading animation
      $("body")
        .append('<div class="search-popup-overlay local-search-pop-overlay">' +
          '<div id="search-loading-icon">' +
          '<i class="fa fa-spinner fa-pulse fa-5x fa-fw"></i>' +
          '</div>' +
          '</div>')
        .css('overflow', 'hidden');
      $("#search-loading-icon").css('margin', '20% auto 0 auto').css('text-align', 'center');

      $.ajax({
        url: path,
        dataType: isXml ? "xml" : "json",
        async: true,
        // Without this handler a failed request would leave the loading overlay
        // on the page with the body's overflow still hidden.
        error: function(jqXHR, textStatus) {
          clearLoadingState();
          if (window.console && console.error) {
            console.error('Local search: failed to load ' + path + ' (' + textStatus + ')');
          }
        },
        success: function(res) {
          // get the contents from search data
          isfetched = true;
          // Move the popup element into .header-inner.
          $('.popup').detach().appendTo('.header-inner');
          // XML: build {title, content, url} records from each <entry>; JSON: use the response as-is.
          var datas = isXml ? $("entry", res).map(function() {
            return {
              title: $("title", this).text(),
              content: $("content",this).text(),
              url: $("url" , this).text()
            };
          }).get() : res;
          var input = document.getElementById(search_id);
          var resultContent = document.getElementById(content_id);

          // Re-run the search for the current input value and re-render the result list.
          var inputEventFunction = function() {
            var searchText = input.value.trim().toLowerCase();
            // Split on whitespace and hyphens; with several keywords the full phrase is searched too.
            var keywords = searchText.split(/[\s\-]+/);
            if (keywords.length > 1) {
              keywords.push(searchText);
            }
            var resultItems = [];
            if (searchText.length > 0) {
              // perform local searching
              datas.forEach(function(data) {
                var isMatch = false;
                var hitCount = 0;
                var searchTextCount = 0;
                var title = data.title.trim();
                var titleInLowerCase = title.toLowerCase();
                // Strip anything that looks like an HTML tag from the content.
                var content = data.content.trim().replace(/<[^>]+>/g,"");
                var contentInLowerCase = content.toLowerCase();
                var articleUrl = decodeURIComponent(data.url);
                var indexOfTitle = [];
                var indexOfContent = [];
                // only match articles with not empty titles
                if(title != '') {
                  keywords.forEach(function(keyword) {
                    // Return {position, word} for every non-overlapping occurrence of `word` in `text`.
                    function getIndexByWord(word, text, caseSensitive) {
                      var wordLen = word.length;
                      if (wordLen === 0) {
                        return [];
                      }
                      var startPosition = 0, position = [], index = [];
                      if (!caseSensitive) {
                        text = text.toLowerCase();
                        word = word.toLowerCase();
                      }
                      while ((position = text.indexOf(word, startPosition)) > -1) {
                        index.push({position: position, word: word});
                        startPosition = position + wordLen;
                      }
                      return index;
                    }

                    indexOfTitle = indexOfTitle.concat(getIndexByWord(keyword, titleInLowerCase, false));
                    indexOfContent = indexOfContent.concat(getIndexByWord(keyword, contentInLowerCase, false));
                  });
                  if (indexOfTitle.length > 0 || indexOfContent.length > 0) {
                    isMatch = true;
                    hitCount = indexOfTitle.length + indexOfContent.length;
                  }
                }

                // show search results

                if (isMatch) {
                  // sort index by position of keyword
                  // Descending by position (ties: shorter word first), so pop() yields the
                  // earliest hit and, at equal positions, the longest word.

                  [indexOfTitle, indexOfContent].forEach(function (index) {
                    index.sort(function (itemLeft, itemRight) {
                      if (itemRight.position !== itemLeft.position) {
                        return itemRight.position - itemLeft.position;
                      } else {
                        return itemLeft.word.length - itemRight.word.length;
                      }
                    });
                  });

                  // merge hits into slices
                  // Consumes (pops) from `index` every hit that ends at or before `end`,
                  // discarding hits that overlap one already accepted. Also counts hits
                  // equal to the full search text and adds them to the article's
                  // `searchTextCount`.

                  function mergeIntoSlice(text, start, end, index) {
                    var item = index[index.length - 1];
                    var position = item.position;
                    var word = item.word;
                    var hits = [];
                    var searchTextCountInSlice = 0;
                    while (position + word.length <= end && index.length != 0) {
                      if (word === searchText) {
                        searchTextCountInSlice++;
                      }
                      hits.push({position: position, length: word.length});
                      var wordEnd = position + word.length;

                      // move to next position of hit

                      index.pop();
                      while (index.length != 0) {
                        item = index[index.length - 1];
                        position = item.position;
                        word = item.word;
                        if (wordEnd > position) {
                          index.pop();
                        } else {
                          break;
                        }
                      }
                    }
                    searchTextCount += searchTextCountInSlice;
                    return {
                      hits: hits,
                      start: start,
                      end: end,
                      searchTextCount: searchTextCountInSlice
                    };
                  }

                  // The title is a single slice spanning its whole length.
                  var slicesOfTitle = [];
                  if (indexOfTitle.length != 0) {
                    slicesOfTitle.push(mergeIntoSlice(title, 0, title.length, indexOfTitle));
                  }

                  var slicesOfContent = [];
                  while (indexOfContent.length != 0) {
                    var item = indexOfContent[indexOfContent.length - 1];
                    var position = item.position;
                    var word = item.word;
                    // cut out 100 characters
                    // Window runs from 20 characters before the hit to 80 after it,
                    // widened to hold the whole word and clamped to the content bounds.
                    var start = position - 20;
                    var end = position + 80;
                    if(start < 0){
                      start = 0;
                    }
                    if (end < position + word.length) {
                      end = position + word.length;
                    }
                    if(end > content.length){
                      end = content.length;
                    }
                    slicesOfContent.push(mergeIntoSlice(content, start, end, indexOfContent));
                  }

                  // sort slices in content by search text's count and hits' count

                  slicesOfContent.sort(function (sliceLeft, sliceRight) {
                    if (sliceLeft.searchTextCount !== sliceRight.searchTextCount) {
                      return sliceRight.searchTextCount - sliceLeft.searchTextCount;
                    } else if (sliceLeft.hits.length !== sliceRight.hits.length) {
                      return sliceRight.hits.length - sliceLeft.hits.length;
                    } else {
                      return sliceLeft.start - sliceRight.start;
                    }
                  });

                  // select top N slices in content
                  // The limit is a template-generated constant (1 here).

                  var upperBound = parseInt('1');
                  if (upperBound >= 0) {
                    slicesOfContent = slicesOfContent.slice(0, upperBound);
                  }

                  // highlight title and content
                  // Returns text[slice.start, slice.end) with each hit wrapped in <b class="search-keyword">.

                  function highlightKeyword(text, slice) {
                    var result = '';
                    var prevEnd = slice.start;
                    slice.hits.forEach(function (hit) {
                      result += text.substring(prevEnd, hit.position);
                      var end = hit.position + hit.length;
                      result += '<b class="search-keyword">' + text.substring(hit.position, end) + '</b>';
                      prevEnd = end;
                    });
                    result += text.substring(prevEnd, slice.end);
                    return result;
                  }

                  // NOTE(review): title, content and articleUrl are concatenated into HTML
                  // without escaping; this is only safe while the search database is
                  // trusted, site-generated data.
                  var resultItem = '';

                  if (slicesOfTitle.length != 0) {
                    resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + highlightKeyword(title, slicesOfTitle[0]) + "</a>";
                  } else {
                    resultItem += "<li><a href='" + articleUrl + "' class='search-result-title'>" + title + "</a>";
                  }

                  slicesOfContent.forEach(function (slice) {
                    resultItem += "<a href='" + articleUrl + "'>" +
                      "<p class=\"search-result\">" + highlightKeyword(content, slice) +
                      "...</p>" + "</a>";
                  });

                  resultItem += "</li>";
                  resultItems.push({
                    item: resultItem,
                    searchTextCount: searchTextCount,
                    hitCount: hitCount,
                    id: resultItems.length
                  });
                }
              });
            }
            if (keywords.length === 1 && keywords[0] === "") {
              // Empty query: show the search-icon placeholder.
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-search fa-5x" /></div>';
            } else if (resultItems.length === 0) {
              // Query with no matches: show the "frown" placeholder.
              resultContent.innerHTML = '<div id="no-result"><i class="fa fa-frown-o fa-5x" /></div>';
            } else {
              // Rank by full-phrase matches, then total hits, then higher id first.
              resultItems.sort(function (resultLeft, resultRight) {
                if (resultLeft.searchTextCount !== resultRight.searchTextCount) {
                  return resultRight.searchTextCount - resultLeft.searchTextCount;
                } else if (resultLeft.hitCount !== resultRight.hitCount) {
                  return resultRight.hitCount - resultLeft.hitCount;
                } else {
                  return resultRight.id - resultLeft.id;
                }
              });
              var searchResultList = '<ul class=\"search-result-list\">';
              resultItems.forEach(function (result) {
                searchResultList += result.item;
              });
              searchResultList += "</ul>";
              resultContent.innerHTML = searchResultList;
            }
          };

          // Template-generated constant comparison; as written the first branch is
          // always taken, so the search runs on every input event.
          if ('auto' === 'auto') {
            input.addEventListener('input', inputEventFunction);
          } else {
            $('.search-icon').click(inputEventFunction);
            input.addEventListener('keypress', function (event) {
              if (event.keyCode === 13) {
                inputEventFunction();
              }
            });
          }

          // remove loading animation
          clearLoadingState();

          proceedsearch();
        }
      });
    };

    // Wire up the search popup: trigger, close button, click shielding and the Esc key.

    // First click loads the search database; later clicks just reopen the popup.
    $('.popup-trigger').on('click', function (evt) {
      evt.stopPropagation();
      if (isfetched !== false) {
        proceedsearch();
        return;
      }
      searchFunc(path, 'local-search-input', 'local-search-result');
    });

    $('.popup-btn-close').on('click', onPopupClose);

    // Stop clicks inside the popup from propagating to ancestor handlers.
    $('.popup').on('click', function (evt) {
      evt.stopPropagation();
    });

    // Esc (key code 27) closes the popup, but only while it is visible.
    $(document).on('keyup', function (evt) {
      if (evt.which !== 27) {
        return;
      }
      if ($('.search-popup').is(':visible')) {
        onPopupClose();
      }
    });
  </script>





  

  

  

  
  

  
  
    <script type="text/x-mathjax-config">
      // MathJax configuration block: passes tex2jax preprocessor options to MathJax.Hub.Config.
      MathJax.Hub.Config({
        tex2jax: {
          // Inline math delimiter pairs: $...$ and \(...\).
          inlineMath: [ ['$','$'], ["\\(","\\)"]  ],
          // NOTE(review): per the MathJax tex2jax docs this lets \$ stand for a literal
          // dollar sign — confirm against the MathJax version loaded by this page.
          processEscapes: true,
          // Tag names handed to tex2jax as skipTags; presumably their contents are
          // left unprocessed — verify against the MathJax docs.
          skipTags: ['script', 'noscript', 'style', 'textarea', 'pre', 'code']
        }
      });
    </script>

    <script type="text/x-mathjax-config">
      // Queue a callback that appends " has-jax" to the className of the parent
      // of every jax source element returned by MathJax.Hub.getAllJax().
      MathJax.Hub.Queue(function () {
        var jaxList = MathJax.Hub.getAllJax();
        var idx = 0;
        while (idx < jaxList.length) {
          var host = jaxList[idx].SourceElement().parentNode;
          host.className = host.className + ' has-jax';
          idx += 1;
        }
      });
    </script>
    <script type="text/javascript" src="//cdn.bootcss.com/mathjax/2.7.1/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
  


  

  

</body>
</html>
